{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1b6fe0c1-931e-4194-bcfe-0716d8f75b50",
   "metadata": {},
   "source": [
    "# Youtube Video Summarization\n",
    "\n",
    "## My First Frontier LLM Project!\n",
    "\n",
    "Welcome to my first LLM-based project! The goal of this project is to leverage large language models (LLMs) to summarize YouTube videos. Currently, it only supports English transcriptions, so instead of watching the entire video, you can simply read the summary!\n",
    "\n",
    "## Important Note\n",
    "Be mindful when testing with longer videos, as they may consume significant resources and could lead to high costs on your ChatGPT bill.\n",
    "You can switch to Ollama for free usage if you're looking to reduce costs.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install youtube-transcript-api openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a082ddaf-abf5-4e6c-8112-74846c768301",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "from openai import OpenAI\n",
    "from youtube_transcript_api import YouTubeTranscriptApi\n",
    "import re\n",
    "\n",
    "# If you get an error running this cell, then please head over to the troubleshooting notebook!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load environment variables in a file called .env\n",
    "\n",
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "# Check the key\n",
    "\n",
    "if not api_key:\n",
    "    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
    "elif not api_key.startswith(\"sk-proj-\"):\n",
    "    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
    "elif api_key.strip() != api_key:\n",
    "    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
    "else:\n",
    "    print(\"API key found and looks good so far!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "openai = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5e793b2-6775-426a-a139-4848291d0463",
   "metadata": {},
   "outputs": [],
   "source": [
    "class YoutubeVideoID:\n",
    "    def __init__(self, url):\n",
    "        self.url = url\n",
    "        self.video_id = self.extract_video_id(url)\n",
    "\n",
    "    def extract_video_id(self, url):\n",
    "        \"\"\"\n",
    "        Extracts the YouTube video ID from a given URL.\n",
    "        Supports both regular and shortened URLs.\n",
    "        \"\"\"\n",
    "        # Regular expression to match YouTube video URL and extract the video ID\n",
    "        regex = r\"(?:https?:\\/\\/)?(?:www\\.)?(?:youtube\\.com\\/(?:[^\\/\\n\\s]+\\/\\S+\\/|\\S*\\?v=)|(?:youtu\\.be\\/))([a-zA-Z0-9_-]{11})\"\n",
    "        match = re.match(regex, url)\n",
    "        \n",
    "        if match:\n",
    "            return match.group(1)\n",
    "        else:\n",
    "            raise ValueError(\"Invalid YouTube URL\")\n",
    "\n",
    "    def __str__(self):\n",
    "        return f\"Video ID: {self.video_id}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example usage\n",
    "video_url = \"https://www.youtube.com/watch?v=kqaMIFEz15s\"\n",
    "\n",
    "yt_video = YoutubeVideoID(video_url)\n",
    "print(yt_video)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f724be3c-bdeb-4079-b4be-f12608144484",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_transcript(video_id, language='en'):\n",
    "    try:\n",
    "        # Try to get the transcript in the desired language (Indonesian by default)\n",
    "        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n",
    "        # Join all the 'text' fields into a single string\n",
    "        return \" \".join([item['text'] for item in transcript])\n",
    "    except Exception as e:\n",
    "        print(f\"Error fetching transcript: {e}\")\n",
    "        return None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12e302fa-f564-4ec6-a08f-b3b3ce549396",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch transcript using the video ID\n",
    "transcript_text = get_transcript(yt_video.video_id)\n",
    "print(len(transcript_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a0750be-88a1-4e65-9cb8-a0a2f11eecdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to summarize text using ChatGPT\n",
    "def summarize_text(text):\n",
    "    try:\n",
    "        system_prompts = \"\"\"\n",
    "        You are a helpful assistant who provides concise and accurate summaries of text. Your task is to:\n",
    "        \n",
    "        - Capture the key points of the content.\n",
    "        - Keep the summary brief and easy to understand.\n",
    "        - Avoid summarizing overly lengthy texts or breaking them into excessively short summaries.\n",
    "        - Use bullet points where appropriate to enhance clarity and structure.\n",
    "        \"\"\"\n",
    "        response = openai.chat.completions.create(\n",
    "            model=\"gpt-4o-mini\",\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": system_prompts},\n",
    "                {\"role\": \"user\", \"content\": f\"Summarize the following text:\\n{text}\"}\n",
    "            ],\n",
    "            max_tokens=200\n",
    "        )\n",
    "        return response.choices[0].message.content\n",
    "    except Exception as e:\n",
    "        print(f\"Error summarizing text: {e}\")\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad646bc4-a11a-4c44-b941-54befdbf9bc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_text(text, chunk_size=3000):\n",
    "    \"\"\"\n",
    "    Splits large text into smaller chunks based on the given chunk size.\n",
    "    Ensures that chunks end with a full stop where possible to maintain sentence integrity.\n",
    "    \n",
    "    :param text: str, the text to be split\n",
    "    :param chunk_size: int, maximum size of each chunk (default 3000 characters)\n",
    "    :return: list of str, where each str is a chunk of text\n",
    "    \"\"\"\n",
    "    chunks = []\n",
    "    while len(text) > chunk_size:\n",
    "        # Find the last full stop within or at the chunk size\n",
    "        split_point = text.rfind('.', 0, chunk_size + 1)  # +1 to include the period itself if it's at chunk_size\n",
    "        if split_point == -1:  # No period found within the chunk size\n",
    "            split_point = chunk_size\n",
    "        \n",
    "        # Append the chunk, ensuring we don't strip spaces that might be part of the sentence structure\n",
    "        chunks.append(text[:split_point + 1] if split_point != chunk_size else text[:chunk_size])\n",
    "        text = text[split_point + 1:] if split_point != chunk_size else text[chunk_size:]\n",
    "    \n",
    "    # Add the remaining text as the final chunk, only strip if there's content\n",
    "    if text:\n",
    "        chunks.append(text.strip())\n",
    "    \n",
    "    return chunks\n",
    "\n",
    "transcript_chunks = split_text(transcript_text)\n",
    "\n",
    "# Now you can summarize each chunk individually\n",
    "summaries = []\n",
    "for chunk in transcript_chunks:\n",
    "    summary = summarize_text(chunk)\n",
    "    summaries.append(summary)\n",
    "\n",
    "\n",
    "# Combine the individual summaries into one\n",
    "full_summary = \" \".join(summaries)\n",
    "display(Markdown(full_summary))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b266fdc-da31-4d79-8982-be77f03be59f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "792c814d-73f8-4c1e-a0bb-b654b40e4d8b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
